By using more than 3 million shopping baskets data provided by Instacart, we are going to find the products that re commonly bought together.
library(arules)
library(arulesViz)
library(dplyr)
# import the first 20000 rows of orders
orders <- read.csv("order_products__prior.csv", nrows = 20000)
# import all products rows
products <- read.csv("products.csv")
# check the imported data
head(orders)
head(products)
# have a glimpse on the data
glimpse(orders)
Observations: 20,000
Variables: 4
$ order_id [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4…
$ product_id [3m[38;5;246m<int>[39m[23m 33120, 28985, 9327, 45918, 30035, 17794, 40141, 1819…
$ add_to_cart_order [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 1…
$ reordered [3m[38;5;246m<int>[39m[23m 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0…
glimpse(products)
Observations: 49,688
Variables: 4
$ product_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
$ product_name [3m[38;5;246m<fct>[39m[23m Chocolate Sandwich Cookies, All-Seasons Salt, Robust Gol…
$ aisle_id [3m[38;5;246m<int>[39m[23m 61, 104, 94, 38, 5, 11, 98, 116, 120, 115, 31, 119, 11, …
$ department_id [3m[38;5;246m<int>[39m[23m 19, 13, 7, 1, 13, 11, 7, 1, 16, 7, 7, 1, 11, 17, 18, 19,…
The two datasets both have a variable called product_id. So we can join the two datasets on this variable to combine the datasets.
order_basket <- orders %>%
inner_join(products, by="product_id") %>%
group_by(order_id) %>%
summarise(basket = as.vector(list(product_name)))
head(order_basket)
order_basket[1,]$basket
[[1]]
[1] Organic Egg Whites
[2] Michigan Organic Kale
[3] Garlic Powder
[4] Coconut Butter
[5] Natural Sweetener
[6] Carrots
[7] Original Unflavored Gelatine Mix
[8] All Natural No Stir Creamy Almond Butter
[9] Classic Blend Cole Slaw
49688 Levels: .5\\" Waterproof Tape ... ZZZQuil Vanilla Cherry Nighttime Liquid Sleep Aid
df <- as(order_basket$basket, "transactions")
head(df)
transactions in sparse format with
6 transactions (rows) and
6984 items (columns)
dim(df)
[1] 4978 2
summary(df)
transactions as itemMatrix in sparse format with
2031 rows (elements/itemsets/transactions) and
6984 columns (items) and a density of 0.001409989
most frequent items:
Banana Bag of Organic Bananas Organic Strawberries
321 236 163
Organic Baby Spinach Organic Hass Avocado (Other)
137 129 19014
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
93 113 136 135 171 152 144 122 123 118 97 52 79 68 57 47 43 47 33 29
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
24 21 13 14 25 13 10 12 4 3 4 1 5 4 1 3 2 2 2 1
41 44 45 46 52 60 61
2 1 1 1 1 1 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 5.000 8.000 9.847 13.000 61.000
includes extended item information - examples:
inspect(df[1:3])
items
[1] {All Natural No Stir Creamy Almond Butter,
Carrots,
Classic Blend Cole Slaw,
Coconut Butter,
Garlic Powder,
Michigan Organic Kale,
Natural Sweetener,
Organic Egg Whites,
Original Unflavored Gelatine Mix}
[2] {Air Chilled Organic Boneless Skinless Chicken Breasts,
Lemons,
Organic Baby Spinach,
Organic Ezekiel 49 Bread Cinnamon Raisin,
Organic Ginger Root,
Total 2% with Strawberry Lowfat Greek Strained Yogurt,
Unsweetened Almondmilk,
Unsweetened Chocolate Almond Breeze Almond Milk}
[3] {Chewy 25% Low Sugar Chocolate Chip Granola,
Energy Drink,
Goldfish Cheddar Baked Snack Crackers,
Honey/Lemon Cough Drops,
Kellogg's Nutri-Grain Apple Cinnamon Cereal,
Kellogg's Nutri-Grain Blueberry Cereal,
Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars,
Oats & Chocolate Chewy Bars,
Original Orange Juice,
Plain Pre-Sliced Bagels,
Sugarfree Energy Drink,
Tiny Twists Pretzels,
Traditional Snack Mix}
itemFrequencyPlot(df, topN = 10, type = "absolute", main = "Top 10 Most Frequent Items")
# create first association rules
rules <- apriori(df, parameter = list(target="frequent itemsets", support=0.008, minlen=2))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 16
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[6984 item(s), 2031 transaction(s)] done [0.01s].
sorting and recoding items ... [139 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [25 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# sort rules by support
rules_support <- sort(rules, by="support", decreasing = TRUE)
# check the first few rules
inspect(rules_support[1:5])
# use a different support level - 0.01
rules_2 <- apriori(df, parameter = list(target="frequent itemsets", support=0.01, minlen=2))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 20
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[6984 item(s), 2031 transaction(s)] done [0.01s].
sorting and recoding items ... [108 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [13 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# sort rules by support
rules_support_2 <- sort(rules_2, by="support", decreasing = TRUE)
# check the most bought itemset
inspect(rules_support_2[1])
rules_df <- DATAFRAME(rules_support)
head(rules_df)
library(ggplot2)
top_10 <- arrange(rules_df, support)
top_10$items <- factor(top_10$items, levels = top_10$items)
ggplot(top_10, aes(x = items, y = support)) + geom_col() +
coord_flip()
# create association rules
as_rules <- apriori(df, parameter = list(supp = 0.002, conf = 0.6, minlen=2, maxlen=3))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 4
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[6984 item(s), 2031 transaction(s)] done [0.01s].
sorting and recoding items ... [824 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3
Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
done [0.01s].
writing ... [12 rule(s)] done [0.00s].
creating S4 object ... done [0.01s].
summary(as_rules)
set of 12 rules
rule length distribution (lhs + rhs):sizes
2 3
7 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 2.000 2.417 3.000 3.000
summary of quality measures:
support confidence lift count
Min. :0.002462 Min. :0.6000 Min. : 3.954 Min. :5.000
1st Qu.:0.002462 1st Qu.:0.6250 1st Qu.: 4.218 1st Qu.:5.000
Median :0.002708 Median :0.7083 Median : 5.150 Median :5.500
Mean :0.002790 Mean :0.7456 Mean : 32.762 Mean :5.667
3rd Qu.:0.002954 3rd Qu.:0.7917 3rd Qu.: 7.740 3rd Qu.:6.000
Max. :0.003447 Max. :1.0000 Max. :169.250 Max. :7.000
mining info:
inspect(head(as_rules))
lhs rhs support confidence lift count
[1] {Blueberry on the Bottom Nonfat Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[2] {Non Fat Black Cherry on the Bottom Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[3] {Chicken Breast Tenders Breaded} => {Banana} 0.002461841 0.6250000 3.954439 5
[4] {Organic Strawberry Fruit Spread} => {Bag of Organic Bananas} 0.002461841 0.8333333 7.171610 5
[5] {Organic Butternut Squash} => {Organic Hass Avocado} 0.002954210 0.6000000 9.446512 6
[6] {Strawberry Preserves} => {Banana} 0.003446578 0.7777778 4.921080 7
df_as_rules <- DATAFRAME(as_rules)
head(df_as_rules)
df_as_rules$rule <- paste(df_as_rules$LHS, " => ", df_as_rules$RHS)
head(df_as_rules)
# Create a scatterplot with support and confidence on the x and y axis (respectively), and lift as the shading of the dots
plot(as_rules,method = 'scatterplot', measure = c("support","confidence"), shading = "lift")
# Create a graph plot for the first 10 rules
plot(as_rules, method = "graph", control = list(cex=0.5))
# Sort the rules by confidence, most confident rules on top and print out the first 10
inspect(sort(as_rules, by = "confidence")[1:10])
lhs rhs support confidence lift count
[1] {Blueberry on the Bottom Nonfat Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[2] {Non Fat Black Cherry on the Bottom Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[3] {Organic Strawberry Fruit Spread} => {Bag of Organic Bananas} 0.002461841 0.8333333 7.171610 5
[4] {Strawberry Preserves} => {Banana} 0.003446578 0.7777778 4.921080 7
[5] {Boneless Skinless Chicken Breasts,
Organic Baby Spinach} => {Banana} 0.003446578 0.7777778 4.921080 7
[6] {Organic Baby Arugula,
Original Hummus} => {Bag of Organic Bananas} 0.002954210 0.7500000 6.454449 6
[7] {Raspberry Preserves} => {Banana} 0.002954210 0.6666667 4.218069 6
[8] {Honeycrisp Apple,
Strawberries} => {Banana} 0.002954210 0.6666667 4.218069 6
[9] {Chicken Breast Tenders Breaded} => {Banana} 0.002461841 0.6250000 3.954439 5
[10] {Organic Grape Tomatoes,
Organic Hass Avocado} => {Bag of Organic Bananas} 0.002461841 0.6250000 5.378708 5
# Inspect the (first few) rules sorted by lift
inspect(sort(as_rules, by = "lift")[1:5])
lhs rhs support confidence lift count
[1] {Blueberry on the Bottom Nonfat Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[2] {Non Fat Black Cherry on the Bottom Greek Yogurt} => {Peach on the Bottom Nonfat Greek Yogurt} 0.002461841 1.0000000 169.250000 5
[3] {Organic Butternut Squash} => {Organic Hass Avocado} 0.002954210 0.6000000 9.446512 6
[4] {Organic Strawberry Fruit Spread} => {Bag of Organic Bananas} 0.002461841 0.8333333 7.171610 5
[5] {Organic Baby Arugula,
Original Hummus} => {Bag of Organic Bananas} 0.002954210 0.7500000 6.454449 6
# Create a subset of the rules called lime_rules and sort it by decreasing lift.
rules = apriori(data=df, parameter=list(support=0.003,confidence=0.2, minlen=2, maxlen=3))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 6
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[6984 item(s), 2031 transaction(s)] done [0.03s].
sorting and recoding items ... [487 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3
Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
done [0.00s].
writing ... [172 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
lime_rules = sort(
subset(rules, subset=rhs %in% 'Limes'),
by = 'lift',decreasing = T
)
inspect(lime_rules)
lime_rules_2 <- apriori(df, parameter = list(support=0.003,confidence=0.2, minlen=2, maxlen=3), appearance = list(default = "lhs", rhs = "Limes"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 6
set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[6984 item(s), 2031 transaction(s)] done [0.01s].
sorting and recoding items ... [487 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3
Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
done [0.00s].
writing ... [4 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
inspect(lime_rules_2)
plot(lime_rules_2, method="graph",control = list(cex=0.5))